** This dofile is to produce a dataset ready for analysis  

// Step 0: Make sure that potential share of mentees and mentor data are already available by running dofiles 4.0.1 and 4.0.4 

// Step 1: Extract unique student id from "$endline/grade10_endline_4Apr2022"
* The unique student id is composed of responseid_* and schoolname_*
* Data can be saved as "$clean/grade10_match_id.dta"

// Step 2: Clean baseline survey data and match with data from step 1
/* I have done some preliminary cleaning in "$dofiles/0.3.build_grade910students_baseline.do", 
but please double-check that dofile by reading through each line.
If there are no errors, you can use the clean data set saved in "$clean/grade910_baseline_wgrade.dta" 
*/

// Step 3: Clean endline survey data and match with data from step 1 and 2
* (Some) code for cleaning is available in "$dofiles/2.1.build_grade10students_endline.do"

// Step 4: Clean test score data 

// Step 5: Merge data from step 3 with data from step 0 and step 4

// Step 6: Create variables for final analysis 

/* Step 7: Merge current database with follow-up data on final outcomes:
		- current grade 11 administrative data 
		- TVET administrative data 
		- administrative data on repeaters
*/

* Output of this dofile is "$clean/grade10_analysis.dta"

* MAKE SURE TO RUN THE MASTER FILE FIRST! 



* ssc install unique
* Start a fresh log for this run (the globals $logfile and $date are presumably set by the master file).
cap log close 
log using "$logfile/cleaning_grade10_$date.log", replace 
 
 
*******************************************************************************
*************************** step 1
* Build the ID crosswalk: one row per record of the matched endline file,
* holding the baseline/endline response ids, school names, treatment flags and strata.
use "$endline/grade10_endline_4Apr2022.dta", clear
* `stable' keeps tied observations in their file order. Without it Stata orders ties
* at random, so uniqueid (= row number) could differ from one run to the next.
sort responseid_b schoolname_b, stable
rename geoprisize strata
keep responseid_b schoolname_b responseid_e schoolname_e treatstudent treatschool strata
* long stores integer ids exactly, whatever the number of rows
g long uniqueid = _n
lab var uniqueid "unique student ID created by researcher"
save "$clean/grade10_match_id.dta", replace

*******************************************************************************
*************************** step 2 
* Runs the baseline cleaning do-file. Per the header notes its cleaned output is
* "$clean/grade910_baseline_wgrade.dta", which step 3.3 below merges on.
do "$dofiles/0.3.build_grade910students_baseline.do"





*******************************************************************************
*************************** step 3 

// 3.1 Endline survey: load the raw export and clean respondent identifiers (code adapted from do-file 2.1)
import delimited "$raw/student/grade910/20220404174118-SurveyExport.csv", varnames(1) encoding(UTF-8) clear 

* Discard respondents who stated that they are not Grade X students
drop if thissurveyisintendedforgradexstu == "No, I am not a Grade X student."

* Short names for the personal-information questions
* (capture: a rename that fails, e.g. because the column is absent, is skipped silently)
capture rename whatisyourname name_original_e
capture rename whatisyourgender sex
capture rename whatisyourcitizenshipidcode11dig citizenid
capture rename inwhichdzongkhagisyourschoolloca district
capture rename dateofbirthdaywhatisyourdateofbi b_day
capture rename dateofbirthmonthwhatisyourdateof b_month
capture rename dateofbirthyearwhatisyourdateofb b_year
capture rename whatisyourstudentidcode studentid
capture rename whatisyouremailaddress email
capture rename whatisyourmobilephonenumber phone

* Gender: raw code 2 becomes 0, so 1 = Male and 0 = Female
recode sex (2 = 0)
label define sex 1 "Male" 0 "Female"
label values sex sex 

* Citizenship ID: strip stray punctuation, normalise case and blanks, remove "na",
* then blank out placeholder sequences and anything that is not exactly 11 characters
format citizenid %15s
foreach junk in "." "]" "/" {
	replace citizenid = subinstr(citizenid, "`junk'", "", .)
}
replace citizenid = trim(itrim(lower(citizenid)))
replace citizenid = subinstr(citizenid, "na", "", .)
format citizenid %11s
replace citizenid = "" if inlist(citizenid, "00000000000", "12345678910", "01234567890", "12345678902", "12345678901", "12345678912", "12345678911")
replace citizenid = "" if strlen(citizenid) != 11

* District names in lower case
replace district = strlower(district)

* format name
* Lower-case the raw name and strip e-mail fragments typed into the name field
replace name_original_e = lower(name_original_e)
foreach frag in "@gemailcom" "@gmailcom" {
	replace name_original_e = subinstr(name_original_e, "`frag'", "", .)
}

* name = raw name reduced to a compact key: no blanks, punctuation or filler text
generate name = name_original_e
replace name = trim(itrim(lower(name)))
replace name = subinstr(name, " ", "", .)   // blanks
replace name = subinstr(name, "'", "", .)   // apostrophe (')
* Remaining single characters (comma, dot, slash, dash, equals sign, parentheses) go first,
* then the multi-character fillers, so fillers split by punctuation are caught as well
foreach frag in "," "." "/" "-" "=" "(" ")" "mynameis" "@gmailcom" "@gemailcom" {
	replace name = subinstr(name, "`frag'", "", .)
}
format name %25s
format name_original_e %30s

* format studentid 
* Normalise the student ID: lower case, no blanks, single dots, and no (partial)
* "@education.gov.bt" e-mail suffix left behind by respondents.
replace studentid = trim(itrim(lower(studentid)))
replace studentid = subinstr(studentid,"..",".",.)  
replace studentid = subinstr(studentid," ","",.)  
replace studentid = subinstr(studentid,"@education.gov.bt","",.)   //Removes irrelevant info
* Mistyped suffix "@education.gov. t": every blank was removed two lines above, so it
* appears here without the blank. The earlier pattern kept the blank and could never match.
replace studentid = subinstr(studentid,"@education.gov.t","",.) 
replace studentid = subinstr(studentid,"ation.gov.bt","",.) 
replace studentid = subinstr(studentid,"@educ","",.) 
replace studentid = subinstr(studentid,"@.educ","",.) 
* br responseid studentid if strlen(studentid)~=17 & ~missing(studentid)
* Blank out IDs of hand-picked responses and literal "na" answers
replace studentid="" if inlist(responseid, 1099, 1139, 2126, 7133, 7626) | studentid=="na"

* School name: take the main question, then fill gaps from the columns v33 ... v42
rename whatisthenameofyourschool schoolname
forvalues col = 33/42 {
	replace schoolname = v`col' if missing(schoolname) & !missing(v`col')
}
replace schoolname = strlower(schoolname)

* Partial submissions are kept only when they carry enough information to identify the student.
* A partial response is dropped only when ALL of the following hold: the name is unusable
* (missing or at most 2 characters), the student ID is missing, and sex and the full date of birth are missing.
label variable status "survey status"
keep if status == "Complete" | ///
       (status == "Partial" & ///
	    !((missing(name_original_e) | strlen(name_original_e) <= 2) & ///
	      missing(studentid) & ///
	      missing(sex) & missing(b_month) & missing(b_year) & missing(b_day)))

* Self-reported programme exposure in the endline survey
generate treat_claim = (didyouparticipateinthementorship == 1)
label variable treat_claim "students claimed to have been participating in the program"

* Questions renamed with the treat_ prefix
rename (howmanytimesdidyoumeetyourmentor v121) (treat_meet_person treat_meet_online)
rename (overallsatisfactionwiththementor overallsatisfactionwiththeconten) (treat_satisfaction_mentor treat_satisfaction_program)
rename howlikelythatyouwillstayincontac treat_contact

* Questions renamed with the untreated_ prefix
rename (doyouknowthatsomestudentsinyours haveyoutalkedwithanyofyourclassm ifthementoringhadbeenavailableto) (untreated_tschool untreated_talk untreated_interest)

* Questions renamed with the control_ prefix
rename (didyouhearaboutthestudentsmentor haveyoutalkedwithanyonewhoreceiv v132) (control_cschool control_talk control_interest)

* Raw code 2 becomes 0 in the four indicator questions
recode untreated_tschool control_cschool untreated_interest control_interest (2 = 0)

* Phone number: lower case and strip "na"; the _e suffix marks it as the endline value
replace phone = subinstr(strlower(phone), "na", "", .)
rename phone phone_e


// 3.2 Endline survey: short names (suffix _e) and variable labels
* Every rename/label is wrapped in capture, so a step that fails is skipped without stopping the run.
label define categories 1 "strongly disagree" 2 "disagree" 3 "neither disagree nor agree" 4 "agree" 5 "strongly agree" 99 "decline to answer", replace

* Perceptions about education and employment (b2, b4, b5)
capture rename scienceandtechnologyaremakingour b2a_e
capture label variable b2a_e "perception: science makes lives healthier, easier, comfortable"
capture rename allthingsconsideredscienceandtec b2d_e
capture label variable b2d_e "perception: science makes the world better off"
capture rename scienceeducationismoreimportantf b4a_e
capture label variable b4a_e "perception: science more important for boy"
capture rename onaveragemenhavehigherabilityins b4b_e
capture label variable b4b_e "perception: men have higher ability in STEMM"
capture rename ifwomenhavestemmsciencetechnolog b4c_e
capture label variable b4c_e "perception: STEMM women suffer in personalsocial life"
capture rename stemmrelatedjobsareformenthanwom b4d_e
capture label variable b4d_e "perception: STEMM jobs are for men than women"
capture rename whenjobsarescarcemenshouldhavemo b5b_e
capture label variable b5b_e "perception: men should have job if scarce"
capture rename whenamotherworksforpaythechildre b5c_e
capture label variable b5c_e "perception: mother works for pay, children suffer"
capture label values b2* b4* b5* categories

* Importance of being knowledgeable (b6)
capture rename sciencehowimportantisitforyoutob b6a_e
capture label variable b6a_e "importance to be knowledgable about science"
capture rename artsandhumanitieshowimportantisi b6b_e
capture label variable b6b_e "importance to be knowledgable about arts"


* Liking of school subjects (c2)
capture rename mathhowmuchdoyoulikeeachofthefol c2a_e
capture label variable c2a_e "liking: math"
capture rename dzongkhahowmuchdoyoulikeeachofth c2b_e
capture label variable c2b_e "liking: dzongkha"
capture rename englishhowmuchdoyoulikeeachofthe c2c_e
capture label variable c2c_e "liking: english"
capture rename physicshowmuchdoyoulikeeachofthe c2d_e
capture label variable c2d_e "liking: physics"
capture rename chemistryhowmuchdoyoulikeeachoft c2e_e
capture label variable c2e_e "liking: chemistry"
capture rename biologyhowmuchdoyoulikeeachofthe c2f_e
capture label variable c2f_e "liking: biology"
capture rename historyandcivicshowmuchdoyoulike c2g_e
capture label variable c2g_e "liking: history"
capture rename geographyhowmuchdoyoulikeeachoft c2h_e
capture label variable c2h_e "liking: geography"
capture rename overallhowmuchdoyoulikeeachofthe c2i_e
capture label variable c2i_e "liking: overall"

* Knowledge about admission criteria of upper secondary school (e1)
capture rename artsstreamatgradexixiihowmuchdoy e1a_e
capture label variable e1a_e "knowledge: admission Arts stream"
capture rename commercestreamatgradexixiihowmuc e1b_e
capture label variable e1b_e "knowledge: admission Commerce stream"
capture rename sciencestreamatgradexixiihowmuch e1c_e
capture label variable e1c_e "knowledge: admission Science stream"
capture rename rigzhungstreamatgradexixiihowmuc e1d_e
capture label variable e1d_e "knowledge: admission Rigzhung stream"
capture rename ttiinsteadofgoingtogradexixiigen e1e_e
capture label variable e1e_e "knowledge: admission TTI"
capture label values e1* categories


* Knowledge about career paths (e4); these columns were imported under generic names v73-v77
capture rename v73 e4a_e
capture label variable e4a_e "knowledge: career Arts stream"
capture rename v74 e4b_e
capture label variable e4b_e "knowledge: career Commerce stream"
capture rename v75 e4c_e
capture label variable e4c_e "knowledge: career Science stream"
capture rename v76 e4d_e
capture label variable e4d_e "knowledge: career Rigzhung stream"
capture rename v77 e4e_e
capture label variable e4e_e "knowledge: career TTI stream"
capture label values e4* categories

* Factual questions on admission rules for Government higher secondary schools
* (these questions have no endline survey index, hence the descriptive names).
* The labels previously used "lab <varname> ...", which is not a valid label subcommand;
* capture hid the error, so none of the four labels was ever applied. "lab var" fixes that.
cap ren whichofthefollowingstatementsisc admission_criteria
cap lab var admission_criteria "to be eligible for admission in the Government higher secondary schools"
cap ren v79 admission_placement
cap lab var admission_placement "placement and admission to class XI"
cap ren v80 entry_sci
* Shortened from the original wording to fit Stata's 80-character limit for variable labels
cap lab var entry_sci "entry requirement for the Science stream in Government higher secondary schools"
cap ren whenstudentshavemettheentryrequi merit_order
cap lab var merit_order "how would schools decide the merit order?"


* Degree ranking by genuine interest (e6); rankings are converted to strings
capture rename bachelorssciencemathcomputerscie e6a_e
capture label variable e6a_e "ranking: SCI"
capture tostring e6a_e, replace 
capture rename bachelorsengineeringpleasechoose e6b_e
capture label variable e6b_e "ranking: ENG"
capture tostring e6b_e, replace 
capture rename bachelorsmedicinehealthandnursin e6c_e
capture label variable e6c_e "ranking: MED"
capture tostring e6c_e, replace 
capture rename bachelorseducationpleasechoosean e6d_e
capture label variable e6d_e "ranking: EDU"
capture tostring e6d_e, replace 
capture rename bachelorsbusinessmanagementecono e6e_e
capture label variable e6e_e "ranking: BUS"
capture tostring e6e_e, replace 
capture rename bachelorsartshumanitiesandsocial e6f_e
capture label variable e6f_e "ranking: ARTS"
capture tostring e6f_e, replace 
capture rename technicaldegreesnationalcertific e6g_e
capture label variable e6g_e "ranking: TTI"
capture tostring e6g_e, replace 
capture rename stopeducationafterhighschoolplea e6h_e
capture label variable e6h_e "ranking: HighSchool"
capture tostring e6h_e, replace 

* Perceived difficulty of the Science stream given own academic performance (e7c, e8c)
capture rename basedonyouracademicperformanceho e7c_e
capture label variable e7c_e "how difficult to enroll in Science stream"
capture rename v91 e8c_e
capture label variable e8c_e "how difficult to complete Science stream"

* Expected enjoyment of each stream (e9)
capture rename artsstreamgradexixiihowmuchdoyou e9a_e
capture label variable e9a_e "enjoyability: Arts stream"
capture rename commercestreamgradexixiihowmuchd e9b_e
capture label variable e9b_e "enjoyability: Commerce stream"
capture rename sciencestreamgradexixiihowmuchdo e9c_e
capture label variable e9c_e "enjoyability: Science stream"
capture rename rigzhungstreamgradexixiihowmuchd e9d_e
capture label variable e9d_e "enjoyability: Rigzhung stream"
capture rename ttieducationaftersecondaryschool e9e_e
capture label variable e9e_e "enjoyability: TTI stream"

* Expected approval when choosing the Science stream (e11c, e12c, e13c)
capture rename howlikelydoyouthinkthatyourparen e11c_e
capture label variable e11c_e "Approval of parents if choosing Science stream"
capture rename howlikelydoyouthinkthatyourfrien e12c_e
capture label variable e12c_e "Approval of friends if choosing Science stream"
capture rename howlikelydoyouthinkpeopleingener e13c_e
capture label variable e13c_e "Approval of society if choosing Science stream"

* Stream ranking if deciding today (e14); rankings are converted to strings
capture rename artsstreamgradexixiiifyoutodayde e14a_e
capture label variable e14a_e "ranking: Arts stream"
capture tostring e14a_e, replace 
capture rename commercestreamgradexixiiifyoutod e14b_e
capture label variable e14b_e "ranking: Commerce stream"
capture tostring e14b_e, replace 
capture rename sciencestreamgradexixiiifyoutoda e14c_e
capture label variable e14c_e "ranking: Science stream"
capture tostring e14c_e, replace 
capture rename rigzhungstreamgradexixiiifyoutod e14d_e
capture label variable e14d_e "ranking: Rigzhung stream"
capture tostring e14d_e, replace 
capture rename v104 e14e_e
capture label variable e14e_e "ranking: TTI"
capture tostring e14e_e, replace 
capture rename stopeducationaftersecondaryschoo e14f_e
capture label variable e14f_e "ranking: MiddleSchool"
capture tostring e14f_e, replace 

* Confidence in the e14 ranking (e15)
capture rename howsureareyouaboutyourchoicesand e15_e
capture label variable e15_e "How sure about e14"

* Beliefs about the share of other students choosing each option (e16): _m_ = male, _f_ = female students
capture rename artsstreamgradexixiiwhatpercenta e16a_m_e
capture label variable e16a_m_e "belief: % male students choosing Arts"
capture rename commercestreamgradexixiiwhatperc e16b_m_e
capture label variable e16b_m_e "belief: % male students choosing Commerce"
capture rename sciencestreamgradexixiiwhatperce e16c_m_e
capture label variable e16c_m_e "belief: % male students choosing Science"
capture rename rigzhungstreamgradexixiiwhatperc e16d_m_e
capture label variable e16d_m_e "belief: % male students choosing Rigzhung"
capture rename v111 e16e_m_e
capture label variable e16e_m_e "belief: % male students choosing TTI"
capture rename v112 e16f_m_e
capture label variable e16f_m_e "belief: % male students stop edu after"
capture rename v113 e16a_f_e
capture label variable e16a_f_e "belief: % female students choosing Arts"
capture rename v114 e16b_f_e
capture label variable e16b_f_e "belief: % female students choosing Commerce"
capture rename v115 e16c_f_e
capture label variable e16c_f_e "belief: % female students choosing Science"
capture rename v116 e16d_f_e
capture label variable e16d_f_e "belief: % female students choosing Rigzhung"
capture rename v117 e16e_f_e
capture label variable e16e_f_e "belief: % female students choosing TTI"
capture rename v118 e16f_f_e
capture label variable e16f_f_e "belief: % female students stop edu after"

* Birth month: convert the English month name to its number (1-12).
* Names that match none of the twelve months (including empty answers) stay missing.
rename b_month month
g b_month=.
local endline_month_no = 0
foreach mname in January February March April May June July August September October November December {
	local ++endline_month_no
	replace b_month = `endline_month_no' if month=="`mname'"
}
drop month

* Endline date of birth as YYYYMMDD, then converted to a string.
* Stored as double: the default float type cannot represent every 8-digit integer
* (above 16,777,216 only even values are exact), so some dates were silently rounded.
g double bday_e = b_year*10^4 + b_month*10^2 + b_day
tostring bday_e, replace 

*
* Two remaining mentor-relationship questions get treat_ names
rename (howwouldyoudescribeyourrelations inadditiontoofficialmeetingshowo) (treat_relation_mentor treat_communication_freq)

* Remove columns that are not used further: closing text, the screening question,
* the "re-enter" confirmation fields, the school-name columns v33-v42 (already folded
* into schoolname) and the survey title columns
drop thisistheendofthesurveyifyouwant thissurveyisintendedforgradexstu reenteryourstudentidcode reenteryourmobilephonenumber reenteryouremailaddress reenteryourcitizenshipidcode11di
drop v33-v42
drop endlinesurveyoncareermentoringfo v22 v23



// Duplicated and invalid student IDs
* dup_studentid = number of other rows sharing the same studentid (0 = unique)
duplicates tag studentid, generate(dup_studentid)
* Within a studentid, "Complete" sorts before "Partial", then by responseid
sort studentid status responseid

* Repeated submissions: drop a row when it repeats the row directly above it, matching
* either on name + studentid + citizenid or on name + sex + full date of birth.
* Three passes, each comparing against the rows that survived the previous drop.
forvalues pass = 1/3 {
	drop if dup_studentid != 0 & name == name[_n-1] & studentid == studentid[_n-1] & citizenid == citizenid[_n-1]
	drop if dup_studentid != 0 & name == name[_n-1] & sex == sex[_n-1] & b_day == b_day[_n-1] & b_month == b_month[_n-1] & b_year == b_year[_n-1]
}

* Recount duplicates after the drops
drop dup_studentid
duplicates tag studentid, generate(dup_studentid)

* for invalid studentID and citizenID: attempt to recover studentid later on
sort studentid responseid
rename (responseid status) (responseid_e status_e)
sort responseid_e 

* Drop placeholder IDs, IDs that are not exactly 17 characters long, and rows that
* still share a studentid while lacking a citizenship ID
drop if inlist(studentid, "000.00000.00.0000", "111.22222.33.4444") | strlen(studentid) != 17 | (missing(citizenid) & dup_studentid != 0)
save "$clean/grade10_endline.dta", replace

// Value labels for the cleaned endline file
use "$clean/grade10_endline.dta", clear

* Label definitions
label define importance 1 "not at all important" 2 "sligtly important" 3 "somewhat important" 4 "very important" 5 "extremely important" 99 "decline to answer", replace
label define difficulty 1 "very difficult" 2 "quite difficult" 3 "moderate" 4 "quite easy" 5 "very easy" 99 "don't know/decline to answer", replace
label define howmuch 1 "not at all" 2 "slightly" 3 "moderate" 4 "very much" 5 "extremely" 99 "don't know/decline to answer", replace
label define howlikely 1 "very unlikely" 2 "unlikely" 3 "possible" 4 "likely" 5 "very likely" 99 "decline to answer", replace
label define howsure 1 "not at all sure" 2 "slightly sure" 3 "somewhat sure" 4 "moderately sure" 5 "extremely sure" 99 "decline to answer", replace
label define relation 1 "very poor" 2 "poor" 3 "fair" 4 "good" 5 "very good", replace
label define rateexper 1 "very dissatisfied" 2 "dissatisfied" 3 "neutral" 4 "satisfied" 5 "very satisfied", replace
label define likelycontact 1 "very unlikely" 2 "unlikely" 3 "possible" 4 "likely" 5 "very likely", replace
label define freq 1 "often" 2 "sometimes" 3 "rarely" 4 "never", replace

* Attach the labels; capture skips an assignment whose variables are absent or not numeric
capture label values b6* importance
capture label values e7c* e8c* difficulty
capture label values e9* howmuch
* NOTE(review): the c2 "liking" items receive the importance scale here - confirm this is intended
capture label values c2* importance
capture label values e11c_e e12c_e e13c_e howlikely 
capture label values e15_e howsure
capture label values treat_relation_mentor relation
capture label values c1* relation
capture label values treat_satisfaction_mentor treat_satisfaction_program rateexper
capture label values treat_contact likelycontact
capture label values control_talk untreated_talk freq

save "$clean/grade10_endline.dta", replace


***********************************************************
// 3.3 Attach the unique ID to the baseline and to the endline survey separately

* Baseline: the crosswalk keys are renamed to the names used in the baseline file,
* and only records found on both sides are kept
use "$clean/grade10_match_id.dta", clear
capture rename responseid_b responseid 
capture rename schoolname_b schoolname
merge 1:m responseid schoolname using "$clean/grade910_baseline_wgrade.dta"
keep if _merge == 3  // 15,449 not matched, 6,458 matched
drop _merge 
* Every variable gets the _bl suffix, except the ID used to link the two waves
rename * *_bl
rename uniqueid_bl uniqueid
save "$temp/grade10_baseline_merged_unique.dta", replace


* Endline: the _e suffix is dropped from all variables except the two merge keys,
* and only records found on both sides are kept
use "$clean/grade10_endline.dta", clear 
rename *_e *
rename (schoolname responseid) (schoolname_e responseid_e)
merge 1:m responseid_e schoolname_e using "$clean/grade10_match_id.dta"
keep if _merge == 3  // 901 not matched, 6189 matched
drop _merge 
capture rename studentid studentid_e
save "$temp/grade10_endline_merged_unique.dta", replace

* Combine the baseline (_bl) and endline records of each student via uniqueid
use "$temp/grade10_baseline_merged_unique.dta", clear 
merge 1:1 uniqueid using "$temp/grade10_endline_merged_unique.dta", nogenerate

* (Re)define the value labels, then attach them; capture skips variables that are absent
label define howmuch2 1 "not at all" 2 "slightly" 3 "moderate" 4 "very much" 5 "extremely" 99 "decline to answer", replace
label define howlikely 1 "very unlikely" 2 "unlikely" 3 "possible" 4 "likely" 5 "very likely" 99 "decline to answer", replace
capture label values z1* howmuch2
capture label values e11c e12c e13c howlikely 

save "$temp/grade10_analysis.dta", replace

* obtain mentorid 
* The mentor assignment comes from the randomization list; its studentid column is
* renamed to studentid_bl so it can be matched to the analysis data set.
clear 
import excel "$randomization/final/randomization_STEM_mainlist_9Oct2021.xlsx", sheet("Sheet1") firstrow
keep mentorid menteeid studentid mentor_name mentor_email mentor_phone
rename studentid studentid_bl

* merge with mentor dataset (only rows with a mentor record are kept), then with the analysis data
merge m:1 mentorid using "$clean/stem_mentor", nogen keep(matched)
merge 1:1 studentid_bl using "$temp/grade10_analysis.dta", nogen 

order mentor*, a(dup_studentid)  
drop dup_studentid

* Numeric baseline birth month, built from the month name in b_month_bl.
* The data already carry the endline b_month at this point. The earlier
* "cap g drop b_month = ." was an invalid command silenced by capture, so endline months
* stayed in place wherever the baseline month name was not recognised and leaked into
* dateofbirth. Start from an empty variable instead.
cap drop b_month
g b_month = .
local bl_month_no = 0
foreach mname in January February March April May June July August September October November December {
	local ++bl_month_no
	replace b_month = `bl_month_no' if b_month_bl=="`mname'"
}

* Baseline date of birth as YYYYMMDD; double keeps all 8 digits exact before the string conversion
g double dateofbirth = b_year_bl*10^4 + b_month*10^2 + b_day_bl
format dateofbirth %15.0f
tostring dateofbirth, replace 
lab var dateofbirth "date of birth YYYYMMDD"

save "$temp/stem_analysis.dta", replace


********************************************************************************
*
*		CLEAN TEST SCORE DATASET 
*
********************************************************************************
* Runs the test-score cleaning do-file. Presumably it produces "$clean/exam_score10.dta",
* which is merged further below - confirm in that do-file.
do "$dofiles/4.1.4.testscore_grade10.do"

********************************************************************************
*
*		PROCESSING DATA FOR ANALYSIS
*
********************************************************************************
* Reload the merged survey data, because the do-file above may have replaced the data in memory
use "$temp/stem_analysis.dta", clear

// standardization <varname>
// Creates z_<varname>: the variable centred on the mean and scaled by the standard
// deviation of the observations with CONTROL == 1. Side effect: the code 99 in the
// source variable itself is first recoded to missing. An existing z_<varname> is replaced.
capture program drop standardization 
program define standardization, rclass 
	args v
	capture drop z_`v' 
	recode `v' (99 = .)
	* moments are taken from the pure-control group only
	summarize `v' if CONTROL == 1, detail
	local mu = r(mean)
	local sigma = r(sd)
	generate z_`v' = (`v' - `mu') / `sigma'
end
// standardization_nomissing <varname>
// Same z-score as standardization (mean and sd of the CONTROL == 1 observations),
// but the source variable is left untouched: the value 99 is NOT recoded to missing.
// An existing z_<varname> is replaced.
capture program drop standardization_nomissing 
program define standardization_nomissing, rclass 
	args v
	capture drop z_`v' 
	* moments are taken from the pure-control group only
	summarize `v' if CONTROL == 1, detail
	local mu = r(mean)
	local sigma = r(sd)
	generate z_`v' = (`v' - `mu') / `sigma'
end


// Analysis variables: sample status
* Attrition: no endline survey status recorded for the student
generate attrition = missing(status)
label variable attrition "indicator for attrited student in first follow-up survey"

* Treatment arms, derived from the baseline-side assignment flags
generate TREAT = (treatstudent_bl == 1)                          // treated students
label variable TREAT "indicator for treatment student"
generate SPILL = (treatstudent_bl == 0 & treatschool_bl == 1)    // spillover students
label variable SPILL "indicator for spillover student"
generate CONTROL = (treatstudent_bl == 0 & treatschool_bl == 0)
label variable CONTROL "indicator for pure control student"

* smallsample = 1 for treated and spillover students, i.e. treatment schools only
generate smallsample = (TREAT != 0 | SPILL != 0)
label variable smallsample "indicator for treatment schools"

* Numeric school code, one value per distinct baseline school name
egen school = group(schoolname_bl)
label variable school "school code"

* Socioeconomic status: age, gender, parents having at least high school education, wealth 
g age_bl = 2021 - b_year_bl
lab var age_bl "student age"

* NOTE(review): students with missing sex_bl are coded male = 0 here - confirm this is intended
g male = sex_bl==1 
lab var male "indicator for being male student"

* Parental education: 1 if the answer code is 6 or higher, missing if unanswered or 99.
* Stata treats missing values as larger than any number, so without the if-condition
* "x3_bl>=6" is true for missing answers and those parents were coded as educated.
g father_edu = x3_bl>=6 if ~missing(x3_bl) & x3_bl~=99
g mother_edu = x7_bl>=6 if ~missing(x7_bl) & x7_bl~=99

lab var father_edu "father completed high school and beyond"
lab var mother_edu "mother completed high school and beyond"

* Wealth: first principal component of the x9 items (yes = 1, no = 0, 99 = missing),
* plus its z-score relative to the control group
recode x9*_bl (2=0) // 2 means no, 1 means yes
recode x9*_bl (99=.)
pca x9*_bl
predict pc1, score
rename pc1 wealth_bl 
standardization wealth_bl 
lab var z_wealth_bl "wealth index (PCA)"
lab var wealth_bl "wealth (PCA)"

* Education-related characteristics: liking for STEM subjects (zscore: c2), parents mentoring stream choice (c3), self-reported academic performance (c1)
* Liking scores are averages over the items that were not answered with 99 (an item coded 99
* contributes 0 to both numerator and denominator); _bl = baseline, no suffix = endline.
capture drop like*

* Baseline
generate like_math_bl = cond(c2a_bl == 99, ., c2a_bl)
generate like_scie_bl = (c2d_bl*(c2d_bl != 99) + c2e_bl*(c2e_bl != 99) + c2f_bl*(c2f_bl != 99)) ///
	/ ((c2d_bl != 99) + (c2e_bl != 99) + (c2f_bl != 99))
generate like_oths_bl = (c2b_bl*(c2b_bl != 99) + c2c_bl*(c2c_bl != 99) + c2g_bl*(c2g_bl != 99) + c2h_bl*(c2h_bl != 99)) ///
	/ ((c2b_bl != 99) + (c2c_bl != 99) + (c2g_bl != 99) + (c2h_bl != 99))
generate like_all_bl = c2i_bl

* Endline
generate like_math = cond(c2a == 99, ., c2a)
generate like_scie = (c2d*(c2d != 99) + c2e*(c2e != 99) + c2f*(c2f != 99)) ///
	/ ((c2d != 99) + (c2e != 99) + (c2f != 99))
generate like_oths = (c2b*(c2b != 99) + c2c*(c2c != 99) + c2g*(c2g != 99) + c2h*(c2h != 99)) ///
	/ ((c2b != 99) + (c2c != 99) + (c2g != 99) + (c2h != 99))
generate like_all = c2i

* z-scores: the four composite scores first, then the three single science items.
* standardization also recodes 99 to missing in the c2d/c2e/c2f items themselves,
* so it has to run after the composites above have been built.
foreach stem in like_math like_scie like_oths like_all c2d c2e c2f {
	standardization `stem'_bl
	standardization `stem'
}

label variable z_c2d "Interest in physics index" 
label variable z_c2e "Interest in chemistry index" 
label variable z_c2f "Interest in biology index"
label variable z_like_math "Interest in math index" 
label variable z_like_scie "Interest in science subjects index"
label variable z_like_oths "Interest in non-science subjects"
label variable z_like_all "Interest in studying overall"

label variable z_like_math_bl "Interest in math index" 
label variable z_like_scie_bl "Interest in science subjects index"
label variable z_like_oths_bl "Interest in non-science subjects index"
label variable z_like_all_bl "Interest in studying overall index"

label variable like_math_bl "Interest in math" 
label variable like_scie_bl "Interest in science subjects"

* Who the student talked to about the stream choice (baseline c3c items)
generate mt_parents = c3c1_bl
label variable mt_parents "talk with parents about stream choice"
generate mt_siblings = c3c2_bl
label variable mt_siblings "talk with siblings about stream choice"
generate mt_friends = c3c4_bl
label variable mt_friends "talk with friends about stream choice"
generate mt_teachers = c3c6_bl
label variable mt_teachers "talk with teachers about stream choice"

* Self-reported performance (baseline c1 items): averages over the items not answered with 99
generate perf_math_bl = c1a_bl / (c1a_bl != 99)   // division by zero gives missing when the answer is 99
generate perf_scie_bl = (c1d_bl*(c1d_bl != 99) + c1e_bl*(c1e_bl != 99) + c1f_bl*(c1f_bl != 99)) ///
	/ ((c1d_bl != 99) + (c1e_bl != 99) + (c1f_bl != 99))
generate perf_oths_bl = (c1b_bl*(c1b_bl != 99) + c1c_bl*(c1c_bl != 99) + c1g_bl*(c1g_bl != 99) + c1h_bl*(c1h_bl != 99)) ///
	/ ((c1b_bl != 99) + (c1c_bl != 99) + (c1g_bl != 99) + (c1h_bl != 99))
generate perf_all_bl = c1i_bl


label variable perf_math_bl "Subjective performance: math"
label variable perf_scie_bl "Subjective performance: science"

* z-scores: composites first, then the single science items (which standardization also recodes 99 -> missing)
foreach stem in perf_math perf_scie perf_oths perf_all c1d c1e c1f {
	standardization `stem'_bl
}

label variable z_c1d_bl "Subjective performance index: physics"
label variable z_c1e_bl "Subjective performance index: chemistry"
label variable z_c1f_bl "Subjective performance index: biology"
label variable z_perf_math_bl "Subjective performance index: math"
label variable z_perf_scie_bl "Subjective performance index: science subjects"
label variable z_perf_oths_bl "Subjective performance index: non-science subjects"
label variable z_perf_all_bl "Subjective performance index: overall"

* Personality (zscore): Big 5 traits from the baseline h1 items
* Each trait is the average of three items, skipping items answered with 99.
* One item per trait (none for openness) enters reversed as (5 - item).
* NOTE(review): if the h1 items are coded 1-5, a reversal would normally be (6 - item) - confirm the scale.
generate big5extro_bl = ((5 - h1a_bl)*(h1a_bl != 99) + h1c_bl*(h1c_bl != 99) + h1i_bl*(h1i_bl != 99)) ///
	/ ((h1a_bl != 99) + (h1c_bl != 99) + (h1i_bl != 99))
generate big5agree_bl = ((5 - h1l_bl)*(h1l_bl != 99) + h1m_bl*(h1m_bl != 99) + h1n_bl*(h1n_bl != 99)) ///
	/ ((h1l_bl != 99) + (h1m_bl != 99) + (h1n_bl != 99))
generate big5consci_bl = (h1d_bl*(h1d_bl != 99) + (5 - h1b_bl)*(h1b_bl != 99) + h1o_bl*(h1o_bl != 99)) ///
	/ ((h1d_bl != 99) + (h1b_bl != 99) + (h1o_bl != 99))
generate big5neuro_bl = (h1e_bl*(h1e_bl != 99) + h1g_bl*(h1g_bl != 99) + (5 - h1h_bl)*(h1h_bl != 99)) ///
	/ ((h1e_bl != 99) + (h1g_bl != 99) + (h1h_bl != 99))
generate big5open_bl = (h1j_bl*(h1j_bl != 99) + h1k_bl*(h1k_bl != 99) + h1f_bl*(h1f_bl != 99)) ///
	/ ((h1j_bl != 99) + (h1k_bl != 99) + (h1f_bl != 99))

foreach trait in big5extro big5agree big5consci big5neuro big5open {
	standardization `trait'_bl
}
label variable z_big5extro_bl "Big 5 extroversion index"
label variable z_big5agree_bl "Big 5 agreeness index"
label variable z_big5consci_bl "Big 5 conscientiousness index"
label variable z_big5neuro_bl "Big 5 neuroticism index"
label variable z_big5open_bl "Big 5 openness index"

label variable big5extro_bl "Big 5 extroversion"
label variable big5agree_bl "Big 5 agreeness"
label variable big5consci_bl "Big 5 conscientiousness"
label variable big5neuro_bl "Big 5 neuroticism"
label variable big5open_bl "Big 5 openness"

// Create relevant variables for analysis: Primary Outcomes 

* Primary outcome: Preferences on higher secondary education: students' self-reported ranking of educational choices including Arts stream, Commerce stream, Science stream, Rigzhung stream at upper secondary school, Technical and Vocational Education and Training (TVET), and stop education after lower secondary school. 

* For each option, build dummies for "ranked 1st or 2nd" (top2) and "ranked 1st" (top1),
* at baseline (_bl) and at endline. The e14 rankings are strings; the question has no
* "I don't know" option, so there is no missing code to handle. Endline dummies are set
* to missing for attrited students.
* Parallel lists: dummy-name tag, e14 item letter, and label text of each option
local stream_tags  sci     art  com      rig      TVET
local stream_items c       a    b        d        e
local stream_names Science Arts Commerce Rigzhung TVET

forvalues s = 1/5 {
	local tag : word `s' of `stream_tags'
	local q   : word `s' of `stream_items'
	local nm  : word `s' of `stream_names'

	generate i_`tag'_top2_bl = inlist(e14`q'_bl, "1", "2")
	generate i_`tag'_top2 = inlist(e14`q', "1", "2")
	replace i_`tag'_top2 = . if attrition

	generate i_`tag'_top1_bl = inlist(e14`q'_bl, "1")
	generate i_`tag'_top1 = inlist(e14`q', "1")
	replace i_`tag'_top1 = . if attrition

	label variable i_`tag'_top2_bl "`nm' as top 2 choice after grade 10 (dummy)"
	label variable i_`tag'_top2 "`nm' as top 2 choice after grade 10 (dummy)"
	label variable i_`tag'_top1_bl "`nm' as top 1 choice after grade 10 (dummy)"
	label variable i_`tag'_top1 "`nm' as top 1 choice after grade 10 (dummy)"
}

* Confidence in the ranking (e15), with code 99 set to missing
generate i_rank_sure_bl = e15_bl
recode i_rank_sure_bl (99 = .)
label variable i_rank_sure_bl "How sure on ranking majors"

generate i_rank_sure = e15
recode i_rank_sure (99 = .)
label variable i_rank_sure "How sure on ranking majors"



// Create relevant variables for analysis: Secondary Outcomes 
* Study preparation

* Test score in the lower secondary school graduation exam "Bhutan Secondary Education Certificate" (bcse) 
* merge with exam scores (one row per uniqueid on both sides; _merge is not kept)
merge 1:1 uniqueid using "$clean/exam_score10.dta", nogen 

* flag students without an exam record (indexnumber is missing)
g missing_score = missing(indexnumber)
lab var missing_score "missing score data in bcse exam"

* Science score: mean of the available physics, chemistry and biology scores.
* rowmean() averages the non-missing components and is missing only when all
* three are missing. The earlier formula multiplied each score by a non-missing
* flag, but in Stata missing times 0 is still missing, so a single missing
* subject blanked the whole average although the denominator already counted
* only the non-missing subjects.
egen score_scie = rowmean(score_phy score_che score_bio)

* standardization_nomissing is defined outside this section; the labels below
* rely on it creating z_<variable> for each score
foreach i in score_eng score_dzo score_math score_phy score_che score_bio score_scie {
	standardization_nomissing `i'
}

* NOTE(review): score_dzo is presumably the Dzongkha language paper, while the
* label below reads "dzongkhag" - confirm the intended wording.
lab var z_score_eng "BCSE english score index"
lab var z_score_dzo "BCSE dzongkhag score index"
lab var z_score_math "BCSE math score index"
lab var z_score_bio "BCSE biology score index"
lab var z_score_phy "BCSE physics score index"
lab var z_score_che "BCSE chemistry score index"
lab var z_score_scie "BCSE science score index"


* Preferences on university-level education: e6 (having a STEM-related majors as top 2)
* The dummy is 1 when any of e6a, e6b, e6c is ranked "1" or "2" (rankings are strings).
g i_e6_bl = 0
g i_e6 = 0
foreach e6_item in e6a e6b e6c {
	replace i_e6_bl = 1 if inlist(`e6_item'_bl, "1", "2")
	replace i_e6 = 1 if inlist(`e6_item', "1", "2")
}
* endline value is not observed where attrition is non-zero
replace i_e6 = . if attrition
lab var i_e6_bl "STEM-related college majors as top 2 choice after grade 12 (dummy)"
lab var i_e6 "STEM-related college majors as top 2 choice after grade 12 (dummy)"


* Index: Attitude to science and technology: b2a, b2d (zscore)
* Mean of the two items, skipping an item coded 99: it adds 0 to the sum and is
* not counted in the denominator. A missing item makes the whole index missing.
g attitude_stem_bl = (cond(b2a_bl != 99, b2a_bl, 0) + cond(b2d_bl != 99, b2d_bl, 0)) / ((b2a_bl != 99) + (b2d_bl != 99))
g attitude_stem = (cond(b2a != 99, b2a, 0) + cond(b2d != 99, b2d, 0)) / ((b2a != 99) + (b2d != 99))

* standardization is defined outside this section; the labels rely on it creating z_<variable>
foreach att_idx in attitude_stem_bl attitude_stem {
	standardization `att_idx'
	lab var z_`att_idx' "Attitude to science and technology index"
}

	* specific component 
	local att_lbl_b2a "Index: science makes lives healthier easier comfortable"
	local att_lbl_b2d "Index: science makes the world better off"
	foreach att_item in b2a b2d {
		standardization `att_item'_bl
		standardization `att_item'
		lab var z_`att_item'_bl "`att_lbl_`att_item''"
		lab var z_`att_item' "`att_lbl_`att_item''"
	}
	

* Index: Gender bias related to STEM and employment: -b4a, -b4b, -b4c, -b4d, -b5b, -b5c (zscore)
* Higher value means more biased against females in STEM 
* Mean over the six items, skipping items coded 99 (they add 0 to the sum and
* are not counted in the denominator).
g genderbias_bl = ( ///
	cond(b4a_bl != 99, b4a_bl, 0) + cond(b4b_bl != 99, b4b_bl, 0) + cond(b4c_bl != 99, b4c_bl, 0) + ///
	cond(b4d_bl != 99, b4d_bl, 0) + cond(b5b_bl != 99, b5b_bl, 0) + cond(b5c_bl != 99, b5c_bl, 0) ///
	) / ( ///
	(b4a_bl != 99) + (b4b_bl != 99) + (b4c_bl != 99) + (b4d_bl != 99) + (b5b_bl != 99) + (b5c_bl != 99) ///
	)
g genderbias = ( ///
	cond(b4a != 99, b4a, 0) + cond(b4b != 99, b4b, 0) + cond(b4c != 99, b4c, 0) + ///
	cond(b4d != 99, b4d, 0) + cond(b5b != 99, b5b, 0) + cond(b5c != 99, b5c, 0) ///
	) / ( ///
	(b4a != 99) + (b4b != 99) + (b4c != 99) + (b4d != 99) + (b5b != 99) + (b5c != 99) ///
	)

foreach gb_idx in genderbias_bl genderbias {
	standardization `gb_idx'
	lab var z_`gb_idx' "Bias against women in STEM index"
}

	* specific components
	foreach gb_item in b4a b4b b4c b4d b5b b5c {
		standardization `gb_item'_bl
		standardization `gb_item'
	}

	* one label text per item, applied to the endline and the baseline z-score
	local gb_lbl_b4a "Index: science is more important for boys than girls"
	local gb_lbl_b4b "Index: men have higher ability in STEMM than women"
	local gb_lbl_b4c "Index: STEMM women suffer in personal and social life"
	local gb_lbl_b4d "Index: STEMM jobs are for men than women"
	local gb_lbl_b5b "Index: men should have job if scarce"
	local gb_lbl_b5c "Index: mother works for pay children suffer"
	foreach gb_item in b4a b4b b4c b4d b5b b5c {
		lab var z_`gb_item' "`gb_lbl_`gb_item''"
		lab var z_`gb_item'_bl "`gb_lbl_`gb_item''"
	}
	
	
* Subjective knowledge on science stream (zscore, e1c, e4c)
* Mean of the two items, skipping an item coded 99.
g subj_sci_bl = (cond(e1c_bl != 99, e1c_bl, 0) + cond(e4c_bl != 99, e4c_bl, 0)) / ((e1c_bl != 99) + (e4c_bl != 99))
g subj_sci = (cond(e1c != 99, e1c, 0) + cond(e4c != 99, e4c, 0)) / ((e1c != 99) + (e4c != 99))

foreach subj_idx in subj_sci_bl subj_sci {
	standardization `subj_idx'
}
foreach subj_idx in subj_sci_bl subj_sci {
	lab var z_`subj_idx' "Subjective knowledge on science stream index"
}

	* specific component 
	local subj_lbl_e1c "Index: subjective knowledge admission to science stream"
	local subj_lbl_e4c "Index: subjective knowledge career path of science stream"
	foreach subj_item in e1c e4c {
		standardization `subj_item'
		standardization `subj_item'_bl
		lab var z_`subj_item' "`subj_lbl_`subj_item''"
		lab var z_`subj_item'_bl "`subj_lbl_`subj_item''"
	}
	
	
* Objective knowledge about entry requirements and application process 
* Share of the non-missing items whose response equals 2 (presumably the correct
* option - confirm against the questionnaire). Missing when all four items are missing.
g obj_sci = ( ///
	(admission_criteria == 2) + (admission_placement == 2) + (entry_sci == 2) + (merit_order == 2) ///
	) / ( ///
	!missing(admission_criteria) + !missing(admission_placement) + !missing(entry_sci) + !missing(merit_order) ///
	)

standardization obj_sci
lab var z_obj_sci "Objective knowledge on science stream index"
* only the z-score is kept; the raw share is dropped
cap drop obj_sci
	
	* specific component: 1 if the response equals 2, 0 otherwise, missing if not answered
	g i_obj_sci_crt = (admission_criteria == 2) if !missing(admission_criteria)
	g i_obj_sci_plm = (admission_placement == 2) if !missing(admission_placement)
	g i_obj_sci_ent = (entry_sci == 2) if !missing(entry_sci)
	g i_obj_sci_merit = (merit_order == 2) if !missing(merit_order)
		
	label variable i_obj_sci_crt "Objective knowlege: admission eligibility (dummy)"
	label variable i_obj_sci_plm "Objective knowlege: admission procedure (dummy)"
	label variable i_obj_sci_ent "Objective knowlege: entry criteria (dummy)"
	label variable i_obj_sci_merit "Objective knowlege: merit ordering (dummy)"

* Expected satisfaction on Science stream: e9
* z-scores of the endline (e9c) and baseline (e9c_bl) item
foreach sat_item in e9c e9c_bl {
	standardization `sat_item'
}
foreach sat_item in e9c e9c_bl {
	lab var z_`sat_item' "Expected satisfaction on science stream index"
}

* zscore Beliefs about approval of parents, peers, and society if enrolling in Science stream
* Mean of e11c, e12c and e13c, skipping items coded 99. Any existing approval*
* variables are removed first.
cap drop approval* 
g approval_sci = (cond(e11c != 99, e11c, 0) + cond(e12c != 99, e12c, 0) + cond(e13c != 99, e13c, 0)) / ((e11c != 99) + (e12c != 99) + (e13c != 99))
g approval_sci_bl = (cond(e11c_bl != 99, e11c_bl, 0) + cond(e12c_bl != 99, e12c_bl, 0) + cond(e13c_bl != 99, e13c_bl, 0)) / ((e11c_bl != 99) + (e12c_bl != 99) + (e13c_bl != 99))

foreach appr_idx in approval_sci approval_sci_bl {
	standardization `appr_idx'
}
foreach appr_idx in approval_sci approval_sci_bl {
	lab var z_`appr_idx' "Approval index if choosing science"
}

	* specific component 
	local appr_who_e11c parents
	local appr_who_e12c friends
	local appr_who_e13c society
	foreach appr_item in e11c e12c e13c {
		standardization `appr_item'
		standardization `appr_item'_bl
		lab var z_`appr_item' "Approval index if choosing science: `appr_who_`appr_item''"
		lab var z_`appr_item'_bl "Approval index if choosing science: `appr_who_`appr_item''"
	}

* zscore Beliefs about peers' preferences on higher secondary education: e16c 
* e16c_f and e16c_m (endline and baseline) are strings: drop their last character
* (presumably a trailing percent sign - confirm) and convert them to numeric.
foreach i in f m {
replace e16c_`i'=substr(e16c_`i', 1, length(e16c_`i') - 1)
destring e16c_`i', replace 
}
foreach i in f m {
replace e16c_`i'_bl=substr(e16c_`i'_bl, 1, length(e16c_`i'_bl) - 1)
destring e16c_`i'_bl, replace 
}

* Each student gets the belief item for his or her own gender.
* Both branches now key on male, matching the baseline version below; the boys'
* branch previously tested sex==1 while the girls' branch tested male==0.
g e16c = e16c_f if male==0
replace e16c = e16c_m if male==1
standardization e16c
lab var z_e16c "Belief index about peers' preference in science"

g e16c_bl = e16c_f_bl if male==0
replace e16c_bl = e16c_m_bl if male==1
standardization e16c_bl
lab var z_e16c_bl "Belief index about peers' preference in science"

* Index: Subjective assessment of ability to enroll (e7) and complete (e8) Science stream
* Mean of e7c and e8c, skipping an item coded 99. An existing variable named
* ability is removed first.
cap drop ability 
g ability = (cond(e7c != 99, e7c, 0) + cond(e8c != 99, e8c, 0)) / ((e7c != 99) + (e8c != 99))
g ability_bl = (cond(e7c_bl != 99, e7c_bl, 0) + cond(e8c_bl != 99, e8c_bl, 0)) / ((e7c_bl != 99) + (e8c_bl != 99))

foreach abil_idx in ability ability_bl {
	standardization `abil_idx'
}
foreach abil_idx in ability ability_bl {
	lab var z_`abil_idx' "Subjective assessment to enroll and complete science stream index"
}

	* specific component 
	local abil_verb_e7c enroll in
	local abil_verb_e8c complete
	foreach abil_item in e7c e8c {
		standardization `abil_item'
		standardization `abil_item'_bl
		lab var z_`abil_item' "Subjective assessment to `abil_verb_`abil_item'' science stream index"
		lab var z_`abil_item'_bl "Subjective assessment to `abil_verb_`abil_item'' science stream index"
	}


// Mentor and mentee 
* gender combination of the pair: 1 male-male, 2 female-female, 3 male-female,
* 4 female-male (mentor first, mentee second). Observations where mentor_male or
* male is not 0 or 1 are left untouched.
cap g genderindex = .
replace genderindex = cond(male == 1, 1, 3) if mentor_male == 1 & inlist(male, 0, 1)
replace genderindex = cond(male == 0, 2, 4) if mentor_male == 0 & inlist(male, 0, 1)
label variable genderindex "mentor-mentee gender pair"
label define genderindex 1 "male-male" 2 "female-female" 3 "male-female" 4 "female-male"
label values genderindex genderindex

// merge with potential share of mentees 
* dofile 4.0.4 runs on a preserved copy, so the data in memory is restored afterwards
preserve
do "$dofiles/4.0.4.potential_share_mentees_g10.do"
restore

* school-level merge; only students whose school is found in the share file are kept
merge m:1 schoolname_bl using "$clean/sharementee_grade10.dta", nogen keep(matched)
label variable potentialshare "Potential share of mentees in each school"

* administrative outcome indicators, all initialised to 0
foreach adm_var in a_adm_tvet a_app_tvet a_adm_tvet_dr a_app_tvet_dr a_adm_tvet_drpr a_app_tvet_drpr r_repeat_data {
	g `adm_var' = 0
}
label variable a_adm_tvet_dr "admission to TVET (administrative data w/o driving centers)"
label variable a_app_tvet_dr "application to TVET (administrative data w/o driving centers)"
label variable a_adm_tvet_drpr "admission to TVET (administrative data w/o driving centers and private institutes)"
label variable a_app_tvet_drpr "application to TVET (administrative data w/o driving centers and private institutes)"
label variable r_repeat_data "repeat grade 10 (administrative data)"

* intermediate file read again by the merge step further down
save "$temp/grade10_analysis.dta", replace


// merge with degree decision data from followup survey, repeat list, and TVET database
* The three dofiles below are run in sequence. They are expected to write the
* files grade10_analysis_gradeadm, grade10_analysis_tvetadm and
* grade10_analysis_repeat in the temp folder, which are merged further down
* (their contents are not visible from this dofile).

do "$dofiles/4.1.1.stream_outcome_grade10.do"

// merge with TVET list 

do "$dofiles/4.1.2.tvet_admission_grade10.do"

// merge with repeating grade data 

do "$dofiles/4.1.3.repeat_grade10.do"


********************************* merge data together **************************
* Reload the student-level file saved above and attach the three outcome files by uniqueid.
* NOTE(review): without the update/replace options, merge keeps the master value
* of any variable present in both files. The a_*_tvet* and r_repeat_data
* placeholders were created as 0 in the master file, so confirm that the using
* files do not carry variables with the same names that are meant to overwrite them.
clear 
use "$temp/grade10_analysis.dta", clear 
merge 1:1 uniqueid using "$temp/grade10_analysis_gradeadm", nogen force
merge 1:1 uniqueid using "$temp/grade10_analysis_tvetadm", nogen force
merge 1:1 uniqueid using "$temp/grade10_analysis_repeat", nogen force

// merge with mentor report dataset 
* Only students with a menteeid can be matched to the mentor report, so the merge
* is done on that subset inside preserve/restore and the result is appended back
* to the students without a menteeid.
preserve 
keep if ~missing(menteeid)
* lower-case the id before matching (presumably the mentor report stores lower-case ids - confirm)
replace menteeid = lower(menteeid)
* keep(master match): mentor-report rows without a matching student are discarded
merge 1:1 menteeid using "$clean/mentor_report.dta", nogen keep(master match)
tempfile data1 
save `data1'
restore 
* replace the original rows of students with a menteeid by their merged version
drop if ~missing(menteeid)
append using `data1'

* Take-up indicators built from num_meeting:
* actualTREAT is 1 when num_meeting is non-missing, actualTREAT4 is 1 when it equals 4
gen actualTREAT = !missing(num_meeting)
gen actualTREAT4 = (num_meeting == 4)

label variable actualTREAT "participated in the program at least 1"
label variable actualTREAT4 "participated in all 4 meetings"

* 1 when any of the three administrative indicators equals 1, 0 otherwise
g o_finoutcomes = (s_stream_data == 1 | a_adm_data == 1 | r_repeat_data == 1)
label variable o_finoutcomes "final outcome available in either grade11adm/tvetadm/repeat data"

* Use strata_bl as the strata variable (both steps are skipped silently if they fail)
cap drop strata
cap ren strata_bl strata
***** KEEP RELEVANT VARIABLES FOR ANALYSIS 

* Mentor-mentee gender-pair dummies from genderindex (1 MM, 2 FF, 3 MF, 4 FM)
local pair_code = 0
foreach pair_tag in MM FF MF FM {
	local ++pair_code
	gen TREAT_`pair_tag' = genderindex == `pair_code'
}

gen TREAT_SAME = inlist(genderindex, 1, 2)
gen TREAT_DIFF = inlist(genderindex, 3, 4)

label variable TREAT_MM "Male Mentor - Male Mentee"
label variable TREAT_FF "Female Mentor - Female Mentee"
label variable TREAT_MF "Male Mentor - Female Mentee"
label variable TREAT_FM "Female Mentor - Male Mentee"
label variable TREAT_SAME "Same Gender Mentor-Mentee"
label variable TREAT_DIFF "Different Genders Mentor-Mentee"

* enrolment in the science stream: system-missing values are set to 0
cap ren science_stream i_sci_enrolled
replace i_sci_enrolled = 0 if i_sci_enrolled == .
cap ren attrition attrition1

ren missing_score attrition_test

* dummies from the string answer treat_contact; missing where treat_contact is missing
cap gen treat_incontact = inlist(treat_contact, "Maybe", "Likely", "Very likely")
cap gen treat_incontactL = inlist(treat_contact, "Likely", "Very likely")
foreach contact_dummy in treat_incontact treat_incontactL {
	replace `contact_dummy' = . if missing(treat_contact)
}
label variable treat_incontact "Maybe, likely or very likely to keep in touch with mentors"
label variable treat_incontactL "Likely or very likely to keep in touch with mentors"

* Global macros listing the variables kept in the analysis dataset.
* Macros ending in _bl hold the baseline counterparts of the same outcome group.
* All of them are expanded in the keep statement that follows this section.

* Primary outcomes and values at baseline 
* primary1: top-1 stream choice dummies and confidence in the ranking
gl primary1 "i_sci_top1 i_art_top1 i_com_top1 i_rig_top1 i_TVET_top1 i_rank_sure"
gl primary1_bl "i_sci_top1_bl i_art_top1_bl i_com_top1_bl i_rig_top1_bl i_TVET_top1_bl i_rank_sure_bl"

* primary2: enrolment in the science stream
gl primary2 "i_sci_enrolled"

* Secondary outcomes and values at baseline
* standardized BCSE scores and passing_bcse (created outside this section)
gl secondary "z_score_math z_score_phy z_score_che z_score_bio z_score_scie passing_bcse"

* Intermediate outcomes and values at baseline 
* intermediate1: STEM college-major preference and liking of subjects
gl intermediate1 "i_e6 z_like_math z_like_scie z_like_all like_math_bl like_scie_bl like_oths_bl like_all_bl"
gl intermediate1_bl "i_e6_bl z_like_math_bl z_like_scie_bl z_like_all_bl"

* intermediate2: attitude to science and technology
gl intermediate2 "z_attitude_stem z_b2a z_b2d"
gl intermediate2_bl "attitude_stem_bl z_attitude_stem_bl z_b2a_bl z_b2d_bl"

* intermediate3: gender bias related to STEM and employment
gl intermediate3 "z_genderbias z_b4a z_b4b z_b4c z_b4d z_b5b z_b5c"
gl intermediate3_bl "genderbias_bl z_genderbias_bl z_b4a_bl z_b4b_bl z_b4c_bl z_b4d_bl z_b5b_bl z_b5c_bl"

* intermediate4: subjective knowledge on science stream
gl intermediate4 "z_subj_sci "
gl intermediate4_bl "subj_sci_bl z_subj_sci_bl"

* intermediate5: objective knowledge on science stream (no baseline list is defined)
gl intermediate5 "z_obj_sci i_obj_sci_crt i_obj_sci_plm i_obj_sci_ent i_obj_sci_merit"

* intermediate6: expected satisfaction on science stream
gl intermediate6 "z_e9c"
gl intermediate6_bl "z_e9c_bl e9c_bl"

* intermediate7: approval of parents, friends and society if choosing science
gl intermediate7 "z_approval_sci z_e11c z_e12c z_e13c"
gl intermediate7_bl "approval_sci_bl z_approval_sci_bl z_e11c_bl z_e12c_bl z_e13c_bl"

* intermediate8: belief about peers' preference in science
gl intermediate8 "z_e16c"
gl intermediate8_bl "z_e16c_bl e16c_bl"

* intermediate9: subjective assessment of ability to enroll in and complete science stream
gl intermediate9 "z_ability z_e7c z_e8c"
gl intermediate9_bl "z_ability_bl ability_bl z_e7c_bl z_e8c_bl"

* Control variables 
* students_ses: demographics, wealth and Big 5 measures; students_aca: self-reported performance
gl students_ses "male age_bl father_edu mother_edu z_wealth_bl z_big5extro_bl z_big5agree_bl z_big5consci_bl z_big5neuro_bl z_big5open_bl big5extro_bl big5agree_bl big5consci_bl big5neuro_bl big5open_bl"
gl students_aca "z_perf_math_bl z_perf_scie_bl z_perf_oths_bl z_perf_all_bl perf_math_bl perf_scie_bl perf_oths_bl perf_all_bl"
* mentors: mentor-side variables (zm_ prefix), presumably from the mentor data - confirm
gl mentors "zm_b5extro zm_b5agree zm_b5consci zm_b5neuro zm_b5open zm_genderbias"
* others: attrition flags, potential share of mentees, and the mt_* variables
gl others "attrition1 potentialshare mt_* attrition_test"
* feedback: programme feedback variables; they are set to missing where TREAT is not 1 right after this section
gl feedback "treat_satisfaction_mentor treat_satisfaction_program treat_incontact treat_incontactL"
* meeting: meeting-level information; grade is listed here but dropped again after the keep statement
gl meeting "num_meeting meeting_minutes1 meeting_minutes2 meeting_minutes3 meeting_minutes4 qual_meeting num_online grade interval communication communication_m"

* Blank the feedback variables for every observation where TREAT is not 1
foreach fb_var in $feedback {
	replace `fb_var' = . if TREAT != 1
}

* Restrict the dataset to identifiers, treatment indicators and the variable
* lists held in the global macros
keep uniqueid strata school TREAT* actualTREAT* SPILL CONTROL ///
	$primary1 $primary1_bl $primary2 $secondary ///
	$intermediate1 $intermediate1_bl $intermediate2 $intermediate2_bl ///
	$intermediate3 $intermediate3_bl $intermediate4 $intermediate4_bl ///
	$intermediate5 $intermediate6 $intermediate6_bl ///
	$intermediate7 $intermediate7_bl $intermediate8 $intermediate8_bl ///
	$intermediate9 $intermediate9_bl ///
	$students_ses $students_aca $mentors $others $feedback $meeting

* grade is part of the meeting list but is not wanted in the final dataset
drop grade

* Label variables 
label variable age_bl "Age"
label variable male "Male"
label variable father_edu "Father with high school diploma (dummy)"
label variable mother_edu "Mother with high school diploma (dummy)"
label variable z_wealth_bl "Wealth index (PCA)"
label variable potentialshare "Potential share of mentees"
label variable subj_sci_bl "Subjective knowledge on science stream"
label variable approval_sci_bl "Beliefs about approval of parents, friends and society (5-point Likert)"
label variable genderbias_bl "Bias against women in STEM (5-point Likert)"
label variable attitude_stem_bl "Attitude toward science and technology (5-point Likert)"
label variable mt_parents "Discuss with parents about career choice (dummy)"
label variable mt_siblings "Discuss with siblings about career choice (dummy)"
label variable mt_friends "Discuss with friends about career choice (dummy)"
label variable mt_teachers "Discuss with teachers about career choice (dummy)"
label variable e16c_bl "Belief about peers' preference in science"
label variable ability_bl "Subjective assessment to enroll and complete science stream"
label variable e9c_bl "Expected satisfaction on science stream"

* value labels for the gender dummy
label define male 0 "Female" 1 "Male", modify
label values male male

* final analysis dataset
save "$clean/grade10_analysis", replace










********************************************************************************
********************************************************************************
******************* BASELINE CHARACTERISTICS OF THE WHOLE SAMPLE ***************
********************************************************************************
********************************************************************************
* For each of the two baseline files, rebuild the baseline characteristics used
* in the analysis sample and save them as comparison_<filename>.dta in the clean folder.
foreach filename in grade910_baseline_reachable_wgrade all_grade910_baseline {
	
use "$clean/`filename'.dta", clear

* every raw variable gets the _bl suffix so the formulas match the main analysis code
rename * *_bl

* Socioeconomic status: age, gender, parents having at least high school education, wealth 
g age_bl = 2021 - b_year_bl
lab var age_bl "student age"

* NOTE(review): a missing sex_bl yields male = 0 here; confirm that sex_bl is never missing
g male = sex_bl==1 
lab var male "indicator for being male student"

* Parental education dummies: 1 if the education code is 6 or above.
* The dummy is left missing when the code is missing or 99. The missing check is
* needed because Stata treats a missing value as larger than any number, so
* x3_bl>=6 alone would code a missing answer as 1.
g father_edu = x3_bl>=6 if !missing(x3_bl) & x3_bl!=99
g mother_edu = x7_bl>=6 if !missing(x7_bl) & x7_bl!=99

lab var father_edu "father completed high school and beyond"
lab var mother_edu "mother completed high school and beyond"

* wealth: first principal component of the x9 items
recode x9*_bl (2=0) // 2 means no, 1 means yes
recode x9*_bl (99=.)
pca x9*_bl
predict pc1, score
rename pc1 wealth_bl 


* Education-related characteristics: liking for STEM subjects (zscore: c2), parents mentoring stream choice (c3), self-reported academic performance (c1)
* Each index is the mean of its items, skipping items coded 99
cap drop like*
g like_math_bl =(c2a_bl*(c2a_bl~=99))/(c2a_bl~=99)
g like_scie_bl = (c2d_bl*(c2d_bl~=99) + c2e_bl*(c2e_bl~=99) + c2f_bl*(c2f_bl~=99))/((c2d_bl~=99) + (c2e_bl~=99) + (c2f_bl~=99))
g like_oths_bl = (c2b_bl*(c2b_bl~=99) + c2c_bl*(c2c_bl~=99) + c2g_bl*(c2g_bl~=99) + c2h_bl*(c2h_bl~=99))/((c2b_bl~=99) + (c2c_bl~=99) + (c2g_bl~=99) + (c2h_bl~=99))
g like_all_bl = c2i_bl

g mt_parents = c3c1_bl
lab var mt_parents "talk with parents about stream choice"
g mt_siblings = c3c2_bl
lab var mt_siblings "talk with siblings about stream choice"
g mt_friends = c3c4_bl
lab var mt_friends "talk with friends about stream choice"
g mt_teachers = c3c6_bl
lab var mt_teachers "talk with teachers about stream choice"

g perf_math_bl = c1a_bl/(c1a_bl~=99)
g perf_scie_bl = (c1d_bl*(c1d_bl~=99) + c1e_bl*(c1e_bl~=99) + c1f_bl*(c1f_bl~=99))/((c1d_bl~=99) + (c1e_bl~=99) + (c1f_bl~=99))
g perf_oths_bl = (c1b_bl*(c1b_bl~=99) + c1c_bl*(c1c_bl~=99) + c1g_bl*(c1g_bl~=99) + c1h_bl*(c1h_bl~=99))/((c1b_bl~=99) + (c1c_bl~=99) + (c1g_bl~=99) + (c1h_bl~=99))
g perf_all_bl = c1i_bl



* Personality 
* Items entered as (5 - item) are presumably reverse-scored.
* NOTE(review): for a 1-5 response scale the usual reversal is 6 - item; confirm the scale of h1.
g big5extro_bl = ((5 - h1a_bl)*(h1a_bl~=99) + h1c_bl*(h1c_bl~=99) + h1i_bl*(h1i_bl~=99))/((h1a_bl~=99) + (h1c_bl~=99) + (h1i_bl~=99))
g big5agree_bl = ((5 - h1l_bl)*(h1l_bl~=99) + h1m_bl*(h1m_bl~=99) + h1n_bl*(h1n_bl~=99))/((h1l_bl~=99) + (h1m_bl~=99) + (h1n_bl~=99))
g big5consci_bl = (h1d_bl*(h1d_bl~=99) + (5 - h1b_bl)*(h1b_bl~=99) + h1o_bl*(h1o_bl~=99))/((h1d_bl~=99) + (h1b_bl~=99) + (h1o_bl~=99))
g big5neuro_bl = (h1e_bl*(h1e_bl~=99) + h1g_bl*(h1g_bl~=99) + (5 - h1h_bl)*(h1h_bl~=99))/((h1e_bl~=99) + (h1g_bl~=99) + (h1h_bl~=99))
g big5open_bl = (h1j_bl*(h1j_bl~=99) + h1k_bl*(h1k_bl~=99) + h1f_bl*(h1f_bl~=99))/((h1j_bl~=99) + (h1k_bl~=99) + (h1f_bl~=99))


* Primary outcome: Preferences on higher secondary education: students' self-reported ranking of educational choices including Arts stream, Commerce stream, Science stream, Rigzhung stream at upper secondary school, Technical and Vocational Education and Training (TVET), and stop education after lower secondary school. 

**** Science 
g i_sci_top1_bl = inlist(e14c_bl, "1")
lab var i_sci_top1_bl "Science as top 1 choice after grade 10 (dummy)"

g i_e6_bl = inlist(e6a_bl, "1", "2") | inlist(e6b_bl, "1", "2") | inlist(e6c_bl, "1", "2")
lab var i_e6_bl "STEM-related college majors as top 2 choice after grade 12 (dummy)"

* Attitude to science and technology: b2a, b2d (zscore)
g attitude_stem_bl = (b2a_bl*(b2a_bl~=99) + b2d_bl*(b2d_bl~=99))/((b2a_bl~=99) + (b2d_bl~=99))	

* Gender bias related to STEM and employment: -b4a, -b4b, -b4c, -b4d, -b5b, -b5c (zscore)
* Higher value means more biased against females in STEM 
g genderbias_bl = (b4a_bl*(b4a_bl~=99) + b4b_bl*(b4b_bl~=99) + b4c_bl*(b4c_bl~=99) + b4d_bl*(b4d_bl~=99) + b5b_bl*(b5b_bl~=99) + b5c_bl*(b5c_bl~=99))/((b4a_bl~=99) + (b4b_bl~=99) + (b4c_bl~=99) + (b4d_bl~=99) + (b5b_bl~=99) + (b5c_bl~=99))

	
* Subjective knowledge on science stream (zscore, e1c, e4c)
g subj_sci_bl = (e1c_bl*(e1c_bl~=99) + e4c_bl*(e4c_bl~=99))/((e1c_bl~=99) + (e4c_bl~=99))

* Expected satisfaction on Science stream: e9
* (e9c_bl is used as is; its 99 code is recoded to missing after the keep below)

* Beliefs about approval of parents, peers, and society if enrolling in Science stream
cap drop approval* 
g approval_sci_bl = (e11c_bl*(e11c_bl~=99) + e12c_bl*(e12c_bl~=99) + e13c_bl*(e13c_bl~=99))/((e11c_bl~=99) + (e12c_bl~=99) + (e13c_bl~=99))

* Beliefs about peers' preferences on higher secondary education: e16c 
* the string answers lose their last character before being converted to numeric
foreach i in f m {
replace e16c_`i'_bl=substr(e16c_`i'_bl, 1, length(e16c_`i'_bl) - 1)
destring e16c_`i'_bl, replace 
}

* each student gets the belief item for his or her own gender
g e16c_bl = e16c_f_bl if male==0
replace e16c_bl = e16c_m_bl if male==1

* Subjective assessment of ability to enroll (e7) and complete (e8) Science stream
cap drop ability 
g ability_bl = (e7c_bl*(e7c_bl~=99) + e8c_bl*(e8c_bl~=99))/((e7c_bl~=99) + (e8c_bl~=99))

* keep only the characteristics reported in the comparison
keep i_sci_top1_bl i_e6_bl like_math_bl like_scie_bl attitude_stem_bl genderbias_bl subj_sci_bl e9c_bl approval_sci_bl e16c_bl ability_bl age_bl male father_edu mother_edu perf_math_bl perf_scie_bl big5extro_bl big5agree_bl big5consci_bl big5neuro_bl big5open_bl

recode e9c_bl (99 = .)

lab var e16c_bl "Belief about peers' preference in science"
lab var ability_bl "Subjective assessment to enroll and complete science stream"
lab var e9c_bl "Expected satisfaction on science stream"
lab var i_sci_top1_bl "Science as top 1 choice after grade 10 (dummy)"
lab var age_bl "Age"
lab var male "Male"
lab var father_edu "Father with high school diploma (dummy)"
lab var mother_edu "Mother with high school diploma (dummy)"
lab var subj_sci_bl "Subjective knowledge on science stream"
lab var approval_sci_bl "Beliefs about approval of parents, friends and society (5-point Likert)"
lab var genderbias_bl "Bias against women in STEM (5-point Likert)"
lab var attitude_stem_bl "Attitude toward science and technology (5-point Likert)"
lab var like_math_bl "Interest in math" 
lab var like_scie_bl "Interest in science subjects"
lab var perf_math_bl "Subjective performance: math"
lab var perf_scie_bl "Subjective performance: science"
lab var big5extro_bl "Big 5 extroversion"
lab var big5agree_bl "Big 5 agreeness"
lab var big5consci_bl "Big 5 conscientiousness"
lab var big5neuro_bl "Big 5 neuroticism"
lab var big5open_bl "Big 5 openness"

save "$clean/comparison_`filename'.dta", replace
}
	